This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Import libraries ----
library(aplore3)           # glow_bonemed data set
library(caret)             # createDataPartition, posPredValue, sensitivity
library(dplyr)
library(tidyverse)
library(car)               # vif
# library(), not require(): require() only warns (returns FALSE) on a
# missing package, letting the script fail later with a confusing error.
library(ggthemes)          # theme_fivethirtyeight
library(glmnet)            # lasso / cv.glmnet
library(cowplot)           # plot_grid
library(GGally)            # ggpairs
library(ResourceSelection) # hoslem.test
library(ROCR)              # prediction / performance
library(pROC)
# Import data ----
# glow_bonemed ships with the aplore3 package (GLOW500 study).
data <- as.data.frame(glow_bonemed)
# Sanity check: 500 subjects x 18 variables.
dim(data)
[1] 500 18
# Treat the three identifier columns as categorical, not numeric.
for (id_col in c("sub_id", "site_id", "phy_id")) {
  data[[id_col]] <- as.factor(data[[id_col]])
}
# Split into train and test sets (80/20, stratified on fracture) ----
set.seed(66)
trainIndex <- createDataPartition(
  data$fracture,
  p = 0.8,
  list = FALSE,
  times = 1
)
train <- data[trainIndex, ]
test  <- data[-trainIndex, ]
# Expect 400 training rows.
dim(train)
[1] 400 18
dim(test) # 100 held-out rows (20% of 500) x 18 columns
[1] 100 18
All of the EDA will be done in the train data
View head of dataframe
head(train) # quick sanity check of the training rows
Looking at Fracture balance
# Look for class imbalance in the response.
# The data set is heavily imbalanced, with many more No's than Yes's;
# verify that the stratified split preserved the class proportions.
data_classes  <- data  %>% ggplot(aes(x = fracture)) + geom_bar() + theme_fivethirtyeight()
train_classes <- train %>% ggplot(aes(x = fracture)) + geom_bar() + theme_fivethirtyeight()
test_classes  <- test  %>% ggplot(aes(x = fracture)) + geom_bar() + theme_fivethirtyeight()
plot_grid(data_classes, train_classes, test_classes,
          labels = c("Overall Data", "Train Data", "Test Data"))
Let’s look at pair plots from all the numeric variables
# Pair plot of the numeric predictors, colored by fracture status.
# Bug fix: train_numeric was computed but never used — pairs() was given
# hard-coded columns 2:8, which include the site_id/phy_id identifiers.
train_numeric <- train %>% select_if(is.numeric)
pairs(train_numeric, col = as.factor(train$fracture))
Looking at a different view of pair plots for numerical variables. Excluding id’s
# Columns 4:8 = priorfrac, age, weight, height, bmi, colored by fracture.
ggpairs(train,columns=4:8,aes(colour=fracture))
Looking at box plot for different numerical variables per fracture or not
# Box plots of each numeric variable split by fracture status.
boxplot_age       <- train %>% ggplot(aes(y = age, x = fracture)) + geom_boxplot() + ggtitle("age vs fracture") + theme_fivethirtyeight()
boxplot_weight    <- train %>% ggplot(aes(y = weight, x = fracture)) + geom_boxplot() + ggtitle("weight vs fracture") + theme_fivethirtyeight()
boxplot_height    <- train %>% ggplot(aes(y = height, x = fracture)) + geom_boxplot() + ggtitle("height vs fracture") + theme_fivethirtyeight()
boxplot_bmi       <- train %>% ggplot(aes(y = bmi, x = fracture)) + geom_boxplot() + ggtitle("bmi vs fracture") + theme_fivethirtyeight()
# Bug fix: title previously read "bmi vs fracture" (copy-paste error).
boxplot_fracscore <- train %>% ggplot(aes(y = fracscore, x = fracture)) + geom_boxplot() + ggtitle("fracscore vs fracture") + theme_fivethirtyeight()
# Bug fix: a 2x2 grid only holds four plots, silently dropping the fifth;
# use 2x3 so all five are displayed.
plot_grid(boxplot_age, boxplot_weight, boxplot_height, boxplot_bmi, boxplot_fracscore,
          nrow = 2, ncol = 3)
Let's look at BMI vs. age, colored by the different categorical variables
# Relationship of bmi and age, colored by each categorical variable.
# Bug fix: the *_smoke object previously colored by raterisk and
# *_raterisk by smoke (assignments were swapped); names and aesthetics
# now agree. Local names also fixed from "bim" to "bmi".
age_bmi_fracture <- train %>% ggplot(aes(x = age, y = bmi, col = fracture)) + geom_point() +
  geom_smooth(method = 'loess', formula = 'y ~ x') +
  ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bmi_premeno <- train %>% ggplot(aes(x = age, y = bmi, col = premeno)) + geom_point() +
  geom_smooth(method = 'loess', formula = 'y ~ x') +
  ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bmi_smoke <- train %>% ggplot(aes(x = age, y = bmi, col = smoke)) + geom_point() +
  geom_smooth(method = 'loess', formula = 'y ~ x') +
  ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bmi_raterisk <- train %>% ggplot(aes(x = age, y = bmi, col = raterisk)) + geom_point() +
  geom_smooth(method = 'loess', formula = 'y ~ x') +
  ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
plot_grid(age_bmi_fracture, age_bmi_premeno, age_bmi_smoke, age_bmi_raterisk,
          nrow = 4, ncol = 1)
Let's look at the different numeric variables vs. the categorical variables per site id. The point is to investigate whether site id had any impact.
# Numeric variables vs fracture, colored by site id, to investigate
# whether site had any impact. site_id was already converted to a factor
# above, so the as.factor() wrappers were redundant and only cluttered
# the legend titles ("as.factor(site_id)").
bmi_frac_type    <- train %>% ggplot(aes(x = fracture, y = bmi, col = site_id)) + geom_boxplot() + ggtitle("BMI for fracture type per site id")
age_frac_type    <- train %>% ggplot(aes(x = fracture, y = age, col = site_id)) + geom_boxplot() + ggtitle("Age for fracture type per site id")
weight_frac_type <- train %>% ggplot(aes(x = fracture, y = weight, col = site_id)) + geom_boxplot() + ggtitle("Weight for fracture type per site id")
height_frac_type <- train %>% ggplot(aes(x = fracture, y = height, col = site_id)) + geom_boxplot() + ggtitle("Height for fracture type per site id")
plot_grid(bmi_frac_type, age_frac_type, weight_frac_type, height_frac_type,
          nrow = 2, ncol = 2)
# Helper functions ----

# Score `model` on the `test` data frame and wrap the predicted
# probabilities in a ROCR prediction object ("Yes" = positive class).
make_predictions <- function(model, test) {
  probs <- predict(model, newdata = test, type = "response")
  prediction(probs, test$fracture, label.ordering = c("No", "Yes"))
}
# Print a confusion matrix plus accuracy / precision / recall / F1 for
# `model` at probability threshold `cutoff`; `model_type` labels the
# output. `test_data` defaults to the global `test` set so existing
# three-argument calls keep working, but the data can now be supplied
# explicitly instead of relying on a global.
classification_metrics <- function(cutoff, model, model_type, test_data = test) {
  fit.pred <- predict(model, newdata = test_data, type = "response")
  class <- factor(ifelse(fit.pred > cutoff, "Yes", "No"), levels = c("No", "Yes"))
  # Confusion matrix (rows = predicted, cols = observed)
  conf <- table(class, test_data$fracture)
  print(paste("Confusion matrix for ", model_type))
  print(conf)
  precision <- posPredValue(class, test_data$fracture, positive = "Yes")
  recall <- sensitivity(class, test_data$fracture, positive = "Yes")
  F1 <- (2 * precision * recall) / (precision + recall)
  print(paste("accuracy = ", round(mean(class == test_data$fracture), 3), sep = ""))
  print(paste("precision = ", round(precision, 3), sep = ""))
  # Bug fix: this line previously rounded and printed `precision` again,
  # so the reported "recall" was wrong whenever precision != recall.
  print(paste("recall = ", round(recall, 3), sep = ""))
  print(paste("F1 = ", round(F1, 3), sep = ""))
}
# Extract a TPR-vs-FPR ROC curve from a ROCR prediction object.
roc_metrics <- function(pred_results) {
  performance(pred_results, measure = "tpr", x.measure = "fpr")
}
# Pull the area under the ROC curve (a one-element list) out of a
# ROCR prediction object.
auc_metrics <- function(pred_results) {
  performance(pred_results, measure = "auc")@y.values
}
# Draw the ROC curve for `pred_results`, add the chance diagonal, and
# annotate "<model_type> AUC = <value>" at plot coordinates (x, y).
# `c` toggles ROCR's colorize option; extra arguments go to plot().
plot_roc <- function (model_type, pred_results, x, y, c, ...) {
  curve <- roc_metrics(pred_results)
  area <- auc_metrics(pred_results)
  plot(curve, colorize = c, ...)
  abline(a = 0, b = 1)
  label <- paste(model_type, " AUC = ", round(area[[1]], 3), sep = "")
  text(x = x, y = y, label)
}
Let's train an interpretable logistic regression using the lasso technique. The point of this model is to be interpretable, meaning no exotic variables such as interaction terms.
# Design matrix for the lasso: every candidate predictor except the
# identifier columns (sub_id, site_id, phy_id).
train.x <- model.matrix(fracture ~ priorfrac + age + weight + height + bmi +
                          premeno + momfrac + armassist + smoke + raterisk +
                          fracscore + bonemed + bonemed_fu + bonetreat, train)
# Use the column by name rather than the fragile positional index 15.
train.y <- train$fracture
nFolds <- 10
set.seed(3)
# Fix the fold assignment so the CV curve is reproducible run to run.
foldid <- sample(rep(seq(nFolds), length.out = nrow(train.x)))
lambdas_to_try <- 10^seq(-3, 5, length.out = 2000)
set.seed(3)
cvfit <- cv.glmnet(train.x, train.y,
                   family = "binomial",
                   type.measure = "class",
                   lambda = lambdas_to_try,
                   nfolds = nFolds,
                   foldid = foldid)
plot(cvfit)
# Coefficients at the lambda that minimizes CV misclassification error.
coef(cvfit, s = "lambda.min")
17 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 3.51060063
(Intercept) .
priorfracYes 0.25344453
age .
weight .
height -0.04300955
bmi 0.03361026
premenoYes 0.42972762
momfracYes 0.65126307
armassistYes 0.01552712
smokeYes -0.63068135
rateriskSame 0.20520092
rateriskGreater 0.40514121
fracscore 0.17376244
bonemedYes 0.97742606
bonemed_fuYes 1.25863914
bonetreatYes -1.67321604
# CV misclassification error rate at the selected lambda.min.
print("CV Error Rate:")
[1] "CV Error Rate:"
cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
[1] 0.2425
#Optimal penalty
print("Penalty Value:")
[1] "Penalty Value:"
cvfit$lambda.min
[1] 0.003437701
Build a final interpretable model based on the feature selection and lambda value chosen above.
#For final model predictions, refit an ordinary logistic regression on
#the lasso-selected variables (age and weight were shrunk to zero).
#finalmodel = glmnet(train.x, train.y, family = "binomial",lambda=cvfit$lambda.min)
# Bug fix: the original formula listed `raterisk` twice
# (raterisk + raterisk); glm() ignores the duplicate, so removing it
# does not change the fit — it just removes the confusion.
finalmodel <- glm(fracture ~ priorfrac + height + bmi + premeno + momfrac +
                    armassist + smoke + raterisk + fracscore + bonemed +
                    bonemed_fu + bonetreat,
                  data = train, family = binomial(link = "logit"))
coef(finalmodel)
(Intercept) priorfracYes height bmi premenoYes momfracYes armassistYes
4.22659120 0.29094012 -0.04895870 0.03675682 0.50380454 0.74656389 0.03859271
smokeYes rateriskSame rateriskGreater fracscore bonemedYes bonemed_fuYes bonetreatYes
-0.72536163 0.28610706 0.48088719 0.17418554 1.80415734 1.60024489 -2.83755399
confint(finalmodel) # profile-likelihood 95% CIs for the coefficients
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) -3.125951306 11.772617251
priorfracYes -0.342350829 0.919089346
height -0.093104728 -0.006631961
bmi -0.013081295 0.086549589
premenoYes -0.133317758 1.125778521
momfracYes 0.021109823 1.458287792
armassistYes -0.718417771 0.793018497
smokeYes -2.037461419 0.347846838
rateriskSame -0.351943896 0.936948811
rateriskGreater -0.207341148 1.179412406
fracscore 0.005433062 0.345608202
bonemedYes 0.172258819 3.529248552
bonemed_fuYes 0.536353147 2.720780709
bonetreatYes -4.879979303 -0.882501113
summary(finalmodel) # coefficient table, deviances, AIC
Call:
glm(formula = fracture ~ priorfrac + height + bmi + premeno +
momfrac + armassist + smoke + raterisk + raterisk + fracscore +
bonemed + bonemed_fu + bonetreat, family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.77904 -0.73822 -0.51798 0.00557 2.29353
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.22659 3.78738 1.116 0.26444
priorfracYes 0.29094 0.32092 0.907 0.36463
height -0.04896 0.02198 -2.227 0.02594 *
bmi 0.03676 0.02533 1.451 0.14675
premenoYes 0.50380 0.31990 1.575 0.11528
momfracYes 0.74656 0.36506 2.045 0.04085 *
armassistYes 0.03859 0.38457 0.100 0.92006
smokeYes -0.72536 0.59527 -1.219 0.22302
rateriskSame 0.28611 0.32751 0.874 0.38234
rateriskGreater 0.48089 0.35246 1.364 0.17246
fracscore 0.17419 0.08653 2.013 0.04412 *
bonemedYes 1.80416 0.82615 2.184 0.02898 *
bonemed_fuYes 1.60024 0.55016 2.909 0.00363 **
bonetreatYes -2.83755 1.00127 -2.834 0.00460 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449.87 on 399 degrees of freedom
Residual deviance: 387.65 on 386 degrees of freedom
AIC: 415.65
Number of Fisher Scoring iterations: 4
# Square of the GVIF^(1/(2*Df)) column, comparable to an ordinary VIF.
(vif(finalmodel)[,3])^2
priorfrac height bmi premeno momfrac armassist smoke raterisk fracscore bonemed bonemed_fu
1.480680 1.168059 1.513558 1.110514 1.133557 2.296923 1.043240 1.114999 2.865624 9.322542 4.360509
bonetreat
13.081331
# Full generalized VIF table. The bonemed/bonetreat values are large
# (9.3 / 13.1), indicating collinearity among the treatment indicators.
vif(finalmodel)
GVIF Df GVIF^(1/(2*Df))
priorfrac 1.480680 1 1.216832
height 1.168059 1 1.080768
bmi 1.513558 1 1.230268
premeno 1.110514 1 1.053809
momfrac 1.133557 1 1.064686
armassist 2.296923 1 1.515560
smoke 1.043240 1 1.021391
raterisk 1.243223 2 1.055935
fracscore 2.865624 1 1.692815
bonemed 9.322542 1 3.053284
bonemed_fu 4.360509 1 2.088183
bonetreat 13.081331 1 3.616812
# NOTE(review): default glm diagnostic plots; residual plots are hard
# to interpret for a binary response — treat with caution.
plot(finalmodel)
Let's look at predictions for the lasso model, also using the ROC plot to select the optimal threshold for classification.
## removed sub_id
# NOTE(review): test.x appears unused below — the predictions are made
# with predict(model, newdata = test) on the data frame directly;
# consider removing this line.
test.x = model.matrix(fracture~site_id+phy_id+priorfrac+age+weight+height+bmi+premeno+momfrac+armassist+ smoke+raterisk + fracscore +bonemed+bonemed_fu+bonetreat, test)
# Hosmer-Lemeshow goodness-of-fit test on the fitted values (g = 10 bins).
hoslem.test(finalmodel$y, fitted(finalmodel), g=10)
Hosmer and Lemeshow goodness of fit (GOF) test
data: finalmodel$y, fitted(finalmodel)
X-squared = 10.259, df = 8, p-value = 0.2473
The p-value is large (0.247), so we fail to reject the null hypothesis of the Hosmer-Lemeshow test: there is no evidence of lack of fit.
# ROC curve for the lasso-selected model on the held-out test set.
preds_lasso <- make_predictions(finalmodel, test)
# TRUE rather than T: T is an ordinary variable and can be reassigned.
plot_roc("Lasso", preds_lasso, 0.2, 0.7, TRUE)
Let's look at the model performance metrics.
classification_metrics(0.3, finalmodel, "Lasso") # threshold 0.3 picked from the ROC curve
[1] "Confusion matrix for Lasso"
class No Yes
No 54 13
Yes 21 12
[1] "accuracy = 0.66"
[1] "precision = 0.364"
[1] "recall = 0.364"
[1] "F1 = 0.414"
library(leaps)
# Sequential-replacement subset search over all predictors except the
# id columns. NOTE(review): regsubsets() fits least-squares models, so
# using the binary factor `fracture` as response is only a rough
# screening device, not a proper logistic model search.
nvmax <- 15
reg_sq <- regsubsets(fracture ~ . - sub_id - site_id - phy_id,
                     data = train, method = "seqrep", nvmax = nvmax)
par(mfrow = c(2, 2))
# Mallows' Cp: mark the minimum.
cp <- summary(reg_sq)$cp
plot(seq_len(nvmax), cp, type = "l", ylab = "CP", xlab = "# of predictors")
index <- which.min(cp)
points(index, cp[index], col = "red", pch = 10)
# BIC: mark the minimum. Bug fix: this previously tested
# bics == -0.05839447, a value hard-coded from one run's printed output
# (the 10-predictor model) and therefore fragile under any change of
# seed or data. If highlighting the chosen 10-variable model was the
# intent, use `index <- 10` explicitly instead.
bics <- summary(reg_sq)$bic
plot(seq_len(nvmax), bics, type = "l", ylab = "BIC", xlab = "# of predictors")
index <- which.min(bics)
points(index, bics[index], col = "red", pch = 10)
# Adjusted R^2: mark the maximum.
adjr2 <- summary(reg_sq)$adjr2
plot(seq_len(nvmax), adjr2, type = "l", ylab = "Adjusted R-squared", xlab = "# of predictors")
index <- which.max(adjr2)
points(index, adjr2[index], col = "red", pch = 10)
# Training RSS: monotone decreasing, minimum is always the largest model.
rss <- summary(reg_sq)$rss
plot(seq_len(nvmax), rss, type = "l", ylab = "train RSS", xlab = "# of predictors")
index <- which.min(rss)
points(index, rss[index], col = "red", pch = 10)
# All selection criteria side by side for inspection.
cbind(CP = summary(reg_sq)$cp,
      r2 = summary(reg_sq)$rsq,
      Adj_r2 = summary(reg_sq)$adjr2,
      BIC = summary(reg_sq)$bic,
      RSS = summary(reg_sq)$rss)
CP r2 Adj_r2 BIC RSS
[1,] 31.509069 0.06228673 0.05993067 -13.74149469 70.32850
[2,] 22.835003 0.08569960 0.08109355 -17.86404376 68.57253
[3,] 18.712500 0.09912891 0.09230413 -17.79138417 67.56533
[4,] 16.348421 0.10870123 0.09967542 -16.07291365 66.84741
[5,] 13.861859 0.11854221 0.10735620 -14.52247919 66.10933
[6,] 10.898890 0.12942816 0.11613699 -13.50174767 65.29289
[7,] 9.329610 0.13725715 0.12185102 -11.12372350 64.70571
[8,] 7.927299 0.14471989 0.12722056 -8.60732019 64.14601
[9,] 26.449361 0.10847983 0.08790628 13.98375977 66.86401
[10,] 8.594098 0.15203105 0.13023236 -0.05839447 63.59767
[11,] 9.317336 0.15483155 0.13087058 4.60984770 63.38763
[12,] 10.654157 0.15628619 0.13012452 9.91226888 63.27854
[13,] 25.997306 0.12701886 0.09761794 29.54397389 65.47359
[14,] 24.465667 0.13476528 0.10330220 31.97018662 64.89260
[15,] 16.000000 0.15772104 0.12481951 27.20582917 63.17092
coef(reg_sq, 10) # least-squares coefficients of the chosen 10-predictor subset
(Intercept) priorfracYes weight bmi premenoYes momfracYes smokeYes fracscore bonemedYes
0.896192259 0.074990114 -0.009135076 0.029519214 0.078337622 0.147229030 -0.100225591 0.027287755 0.372353460
bonemed_fuYes bonetreatYes
0.356825487 -0.614896361
#To deal with the redundancy, drop the variables not selected by the subset search (height, armassist, raterisk) and see what happens
# Refit the ten predictors selected above as a proper logistic regression.
model.main <- glm(
  fracture ~ priorfrac + weight + bmi + premeno + momfrac + smoke +
    fracscore + bonemed + bonemed_fu + bonetreat,
  data = train,
  family = binomial(link = "logit")
)
summary(model.main)
Call:
glm(formula = fracture ~ priorfrac + weight + bmi + premeno +
momfrac + smoke + fracscore + bonemed + bonemed_fu + bonetreat,
family = binomial(link = "logit"), data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.68327 -0.74758 -0.51429 -0.00849 2.32662
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.42910 0.73446 -4.669 3.03e-06 ***
priorfracYes 0.37714 0.30346 1.243 0.21394
weight -0.05476 0.02293 -2.388 0.01695 *
bmi 0.17836 0.06172 2.890 0.00385 **
premenoYes 0.53354 0.31491 1.694 0.09022 .
momfracYes 0.81220 0.35193 2.308 0.02101 *
smokeYes -0.69295 0.59151 -1.171 0.24140
fracscore 0.17217 0.06114 2.816 0.00487 **
bonemedYes 1.87128 0.83048 2.253 0.02424 *
bonemed_fuYes 1.73253 0.53628 3.231 0.00124 **
bonetreatYes -2.93248 1.00520 -2.917 0.00353 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449.87 on 399 degrees of freedom
Residual deviance: 388.89 on 389 degrees of freedom
AIC: 410.89
Number of Fisher Scoring iterations: 4
# Odds ratios with Wald-based 95% confidence intervals.
exp(cbind("Odds ratio" = coef(model.main), confint.default(model.main, level = 0.95)))
Odds ratio 2.5 % 97.5 %
(Intercept) 0.03241615 0.007684079 0.1367511
priorfracYes 1.45811288 0.804427013 2.6429908
weight 0.94671625 0.905106185 0.9902392
bmi 1.19525393 1.059076423 1.3489413
premenoYes 1.70495430 0.919722940 3.1605922
momfracYes 2.25286510 1.130235590 4.4905692
smokeYes 0.50010048 0.156877764 1.5942380
fracscore 1.18787740 1.053723165 1.3391114
bonemedYes 6.49663181 1.275805335 33.0820257
bonemed_fuYes 5.65495838 1.976724475 16.1775476
bonetreatYes 0.05326457 0.007426895 0.3820054
# VIFs: weight/bmi (~9) and bonemed/bonetreat (~9/13) show strong collinearity.
vif(model.main)
priorfrac weight bmi premeno momfrac smoke fracscore bonemed bonemed_fu bonetreat
1.330137 9.180248 8.988158 1.075832 1.064687 1.038839 1.426069 9.429683 4.153687 13.202319
#Residual diagnostics can be obtained using
plot(model.main)
# ROC curve for the stepwise/subset model on the held-out test set.
preds_step <- make_predictions(model.main, test)
# TRUE rather than T: T is an ordinary variable and can be reassigned.
plot_roc("Step", preds_step, 0.2, 0.8, TRUE)
Let's look at the model performance metrics.
classification_metrics(0.22, model.main, "Step") # threshold 0.22 picked from the ROC curve
[1] "Confusion matrix for Step"
class No Yes
No 46 10
Yes 29 15
[1] "accuracy = 0.61"
[1] "precision = 0.341"
[1] "recall = 0.341"
[1] "F1 = 0.435"
# Overlay both ROC curves for comparison.
# TRUE/FALSE instead of T/F: the single-letter aliases are ordinary
# variables and can be silently reassigned.
plot_roc("Step", preds_step, 0.2, 0.8, FALSE, col = "red")
par(new = TRUE)
plot_roc("Lasso", preds_lasso, 0.2, 0.7, FALSE, col = "blue")
legend(0.7, 0.3, legend = c("Step", "Lasso"), fill = c("red", "blue"))